Project Part 1

Group 9

Authors
Affiliation

Thimoté Dupuch

University of Twente

Joris van Lierop

University of Twente

Jurre van Sijpveld

University of Twente

Published

April 9, 2025

Loading libraries

library(dplyr)
library(forcats)
library(vtable)
library(ggplot2)
library(plotly) # Optional
library(broom)
library(lmtest)
library(mice)

Loading dataset

# Read the semicolon-delimited data file (relative to the working directory).
# This is the version of the data without missing values.
dataset <- read.csv(file = "SMARTc.csv", sep = ";")

Re-encode the categorical variables

# Convert the numerically coded categorical columns into labelled factors.
# Columns that share the same coding scheme are recoded together with
# across(); every column is replaced in place, so column order is unchanged.
dataset <- mutate(
  dataset,
  # 0/1 indicator columns -> "no"/"yes"
  across(
    c(EVENT, DIABETES, CEREBRAL, CARDIAC, AAA, PERIPH, STENOSIS),
    function(x) fct_recode(factor(x), "no" = "0", "yes" = "1")
  ),
  # 1/2 -> "male"/"female"
  SEX = fct_recode(factor(SEX), "male" = "1", "female" = "2"),
  # 1/2/3 -> "never"/"former"/"current"
  across(
    c(SMOKING, alcohol),
    function(x) {
      fct_recode(factor(x), "never" = "1", "former" = "2", "current" = "3")
    }
  ),
  # 1/2/3 -> "no"/"micro"/"macro"
  albumin = fct_recode(
    factor(albumin),
    "no" = "1", "micro" = "2", "macro" = "3"
  )
)

Description of the dataset and table of variables

The dataset is about cardiovascular health. It contains two outcomes: EVENT, the presence of a cardiovascular event, and TEVENT, the number of days the patient is in the study until the event occurs. The dataset contains many variables, some categorical and some numerical. It covers patient descriptives, classical risk factors, previous symptomatic atherosclerosis, and markers of atherosclerosis.

# Summary statistics for every variable, including the median;
# out = "return" hands the table back as an object instead of displaying it
# in a viewer, so it is printed inline here.
sumtable(data = dataset, add.median = TRUE, out = "return")

Association between variables and the outcome

# Overall share of patients with EVENT == "yes"; used as a reference line.
avg_event_proportion <- mean(dataset$EVENT == "yes")

# Stacked bars rescaled to height 1 (position = "fill"), one bar per smoking
# status, split by event status. The dashed line marks the overall event share.
bar_plot <- ggplot(data = dataset, mapping = aes(x = SMOKING, fill = EVENT)) +
  geom_bar(position = "fill") +
  geom_hline(yintercept = avg_event_proportion, linetype = "dashed") +
  ggtitle("Cardiovascular Event by Smoking Status") +
  xlab("Smoking Status") +
  ylab("Proportion (-- : Average)") +
  labs(fill = "Cardiovascular Event")

# Interactive alternative (disabled):
# ggplotly(bar_plot, width = 500, height = 600)
bar_plot

This bar plot shows the proportion of cardiovascular events by smoking status. The dashed line represents the average proportion of cardiovascular events in the dataset. The proportion of cardiovascular events is higher for former smokers than for the other groups, and it also lies above the overall average.

# Distribution of AGE within each level of EVENT, drawn as box plots.
boxplot <- ggplot(data = dataset, mapping = aes(x = as.factor(EVENT), y = AGE)) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("Boxplot of age by event") +
  xlab("EVENT") +
  ylab("AGE")

# Render as an interactive widget; ggplotly() is provided by plotly, so that
# package must be loaded for this line to run.
ggplotly(boxplot)

Logistic regression model

# Logistic regression of EVENT on patient characteristics and risk factors.
# CREAT enters the model on the log scale; all other predictors enter as-is.
fit <- glm(
  formula = EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES + HISTCAR2 +
    HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING + alcohol + albumin,
  family = binomial(link = "logit"),
  data = dataset
)

# Coefficient table (estimate, standard error, statistic, p-value) per term.
tidy(fit)

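The logistic regression model shows that the variables associated with the lowest p-values are HISTCAR2, HDL and AGE. The other variables have higher p-values, which means the data provide weaker evidence that they are associated with the outcome once the other variables are accounted for; a high p-value does not by itself show that an association is absent.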

# Sequential analysis-of-deviance table for the fitted model.
anova(fit)

# Confidence intervals for the model coefficients. Passing the full default
# vector c("LRT", "Rao") selects its first element, so "LRT" is stated
# explicitly here.
conf_intervals <- confint(object = fit, trace = FALSE, test = "LRT")
Waiting for profiling to be done...
# Turn the confidence-limit matrix into a data frame suitable for plotting.
conf_intervals_df <- as.data.frame(conf_intervals)
conf_intervals_df
conf_intervals_df$Variable <- rownames(conf_intervals_df)
colnames(conf_intervals_df) <- c("Lower", "Upper", "Variable")

# Attach the fitted coefficient of each term. Intervals obtained by profiling
# the likelihood are generally not symmetric around the estimate, so the
# interval midpoint must not be used as the point estimate.
conf_intervals_df$Estimate <- unname(coef(fit)[conf_intervals_df$Variable])

# The intercept is not a predictor effect; leave it out of the plot.
conf_intervals_df <- conf_intervals_df[
  conf_intervals_df$Variable != "(Intercept)",
]

# One horizontal error bar per model term (interval limits) with the fitted
# coefficient marked as a point.
conf_intervals_plot <- ggplot(
  conf_intervals_df,
  aes(x = Variable, ymin = Lower, ymax = Upper)
) +
  geom_errorbar(width = 0.2) +
  geom_point(aes(y = Estimate)) +
  coord_flip() +
  labs(
    title = "Confidence Intervals for Logistic Regression Coefficients",
    x = "Variable",
    y = "Confidence Interval"
  )

conf_intervals_plot